Skip to content

gh-148276: Optimize object creation and method calls in the JIT by resolving __init__ at trace optimization time #148277

Open
eendebakpt wants to merge 4 commits into python:main from
eendebakpt:type_guard_elimination
Open

gh-148276: Optimize object creation and method calls in the JIT by resolving __init__ at trace optimization time #148277
eendebakpt wants to merge 4 commits into python:main from
eendebakpt:type_guard_elimination

Conversation

@eendebakpt
Copy link
Copy Markdown
Contributor

@eendebakpt eendebakpt commented Apr 8, 2026

Optimize object creation and method calls in the JIT by resolving __init__ at trace optimization time and eliminating redundant type guards. The idea came up while experimenting with the ideas in #144388 using Claude Code.

Changes

  • _CHECK_AND_ALLOCATE_OBJECT: resolve the __init__ function to a constant via _spec_cache.init, allowing the optimizer to eliminate _CHECK_FUNCTION_VERSION and _CHECK_FUNCTION_EXACT_ARGS for the init call
  • _GUARD_TYPE_VERSION_LOCKED: propagate type version info so repeated guards on the same type within a trace are NOPed

Benchmark (release JIT, x86_64) on the following class (the full script is in the details below):

class Point:
    """Small 2-D point class used as the benchmark subject.

    It has a Python-level ``__init__`` and two methods, one of which
    constructs a new ``Point``.
    """

    def __init__(self, x, y):
        """Store the two coordinates as instance attributes."""
        self.x = x
        self.y = y

    def distance_sq(self):
        """Return ``x*x + y*y`` for this point."""
        return self.x * self.x + self.y * self.y

    def translate(self, dx, dy):
        """Return a new ``Point`` offset by ``(dx, dy)``; ``self`` is not modified."""
        return Point(self.x + dx, self.y + dy)
| Benchmark | main | branch | Speedup |
|---|---|---|---|
| Point(x, y) | 79.8 ns | 72.9 ns | 1.09x |
| p.translate().dist() | 157.7 ns | 130.7 ns | 1.21x |
| v.scale().add().dot() | 399.3 ns | 301.7 ns | 1.32x |

Object creation + method chains are 1.2-1.3x faster. Simple method calls and descriptors are unchanged.

Details (full benchmark script):
"""Benchmark for type guard elimination and __init__ resolution.

Tests the optimizations from the type_guard_elimination branch:
1. __init__ function resolution in _CHECK_AND_ALLOCATE_OBJECT
2. Redundant _GUARD_TYPE_VERSION_LOCKED elimination

Usage:
    ./python bench_type_guard.py
    ./python bench_type_guard.py --trace   # show tier 2 traces
"""

import sys
import timeit

# Pass --trace on the command line to also dump the tier 2 traces.
SHOW_TRACE = "--trace" in sys.argv

# --- System info ---
print("=" * 60, "Type Guard Elimination Benchmark", "=" * 60, sep="\n")
print(f"Python: {sys.version}")
# NOTE(review): presence of sys.gettotalrefcount is used here as the
# debug-build indicator.
print(f"Debug:  {hasattr(sys, 'gettotalrefcount')}")

# sys._jit may be absent; in that case the JIT line is simply not printed.
jit = getattr(sys, "_jit", None)
if jit:
    print(f"JIT:    available={jit.is_available()}, enabled={jit.is_enabled()}")

# Being able to import TIER2_THRESHOLD is what marks tier 2 as enabled.
try:
    from _testinternalcapi import TIER2_THRESHOLD
except (ImportError, AttributeError):
    tier2 = False
    print("Tier 2: disabled")
else:
    tier2 = True
    print(f"Tier 2: enabled (threshold={TIER2_THRESHOLD})")
print()


# --- Benchmark functions ---

class Point:
    """Small 2-D point class used as a benchmark workload.

    It has a Python-level ``__init__`` and two methods, one of which
    constructs a new ``Point``.
    """

    def __init__(self, x, y):
        """Store the two coordinates as instance attributes."""
        self.x = x
        self.y = y

    def distance_sq(self):
        """Return ``x*x + y*y`` for this point."""
        return self.x * self.x + self.y * self.y

    def translate(self, dx, dy):
        """Return a new ``Point`` offset by ``(dx, dy)``; ``self`` is not modified."""
        return Point(self.x + dx, self.y + dy)


def bench_init(n):
    """Create ``n`` Point instances and return the sum of their ``x`` values.

    Object-creation workload (tests ``__init__`` resolution): every
    iteration performs one ``Point(1.0, 2.0)`` call and one attribute read.
    """
    acc = 0.0
    for _ in range(n):
        pt = Point(1.0, 2.0)
        acc += pt.x
    return acc


def bench_method_chain(n):
    """Call ``distance_sq()`` ``n`` times on one Point and return the sum.

    Method-call workload (tests type guard elimination): the receiver is
    built once before the loop, so each iteration is a single method call
    on the same object.
    """
    point = Point(1.0, 2.0)
    acc = 0.0
    for _ in range(n):
        acc += point.distance_sq()
    return acc


def bench_translate_chain(n):
    """Translate a Point ``n`` times, summing ``distance_sq()`` after each step.

    Combined workload (object creation + method call): ``translate`` builds
    a fresh Point each iteration, which then receives a method call.
    """
    current = Point(0.0, 0.0)
    acc = 0.0
    for _ in range(n):
        current = current.translate(1.0, 0.5)
        acc += current.distance_sq()
    return acc


class Vector:
    """Small 3-component vector class used as a benchmark workload.

    ``scale`` and ``add`` each construct a new ``Vector``; ``dot`` returns
    a number.
    """

    def __init__(self, x, y, z):
        """Store the three components as instance attributes."""
        self.x = x
        self.y = y
        self.z = z

    def dot(self, other):
        """Return the sum of the component-wise products of ``self`` and ``other``."""
        return self.x * other.x + self.y * other.y + self.z * other.z

    def scale(self, s):
        """Return a new ``Vector`` with every component multiplied by ``s``."""
        return Vector(self.x * s, self.y * s, self.z * s)

    def add(self, other):
        """Return a new ``Vector`` holding the component-wise sum with ``other``."""
        return Vector(self.x + other.x, self.y + other.y, self.z + other.z)


def bench_vector_ops(n):
    """Run ``n`` rounds of scale -> add -> dot on Vectors and return the sum.

    Each iteration creates two temporary Vector objects (one from
    ``scale``, one from ``add``) and finishes with a ``dot`` call.
    """
    base = Vector(1.0, 2.0, 3.0)
    offset = Vector(4.0, 5.0, 6.0)
    acc = 0.0
    for _ in range(n):
        combined = base.scale(2.0).add(offset)
        acc += combined.dot(base)
    return acc


def bench_list_append(n):
    """Append ``0 .. n-1`` to a fresh list and return the list's length.

    Workload for a built-in method call (``list.append``).
    """
    items = []
    for value in range(n):
        items.append(value)
    return len(items)


def bench_str_method(n):
    """Call ``str.startswith`` ``n`` times and return how often it was true.

    The string and prefix are fixed, so the result equals ``n``; the
    counter is there so the call result is actually consumed.
    """
    text = "hello world"
    hits = 0
    for _ in range(n):
        if text.startswith("hello"):
            hits += 1
    return hits


# --- Warmup ---
# Run every workload once before traces are inspected or timings taken.
# NOTE(review): 10_000 iterations is presumably chosen to exceed the tier 2
# threshold -- confirm against TIER2_THRESHOLD.
LOOP = 10_000
for fn in (
    bench_init,
    bench_method_chain,
    bench_translate_chain,
    bench_vector_ops,
    bench_list_append,
    bench_str_method,
):
    fn(LOOP)


# --- Show traces ---
# Only runs when --trace was given and tier 2 was detected above.
if SHOW_TRACE and tier2:
    from _opcode import get_executor

    # A uop is printed when its name contains any of these substrings.
    interesting = (
        "GUARD", "INIT", "CHECK", "ALLOCATE", "CALL",
        "LOAD_ATTR", "PUSH_FRAME", "CREATE", "VERSION",
        "NOP", "EXPAND", "METHOD",
    )
    traced = [
        ("bench_init", bench_init),
        ("bench_method_chain", bench_method_chain),
        ("bench_translate_chain", bench_translate_chain),
        ("bench_vector_ops", bench_vector_ops),
    ]

    print("-" * 60)
    print("Tier 2 Traces")
    print("-" * 60)

    for label, func in traced:
        code = func.__code__
        # Probe every even byte offset of the bytecode and report the
        # first executor found for this function.
        for offset in range(0, len(code.co_code) - 1, 2):
            try:
                ex = get_executor(code, offset)
            except (ValueError, TypeError, RuntimeError):
                continue
            if ex is None:
                continue

            print(f"\n  {label}:")
            for j, op in enumerate(ex):
                name = op[0]
                if not any(k in name for k in interesting):
                    continue
                # A NOP that is not itself a guard is flagged as eliminated.
                if "NOP" in name and "GUARD" not in name:
                    marker = " ← eliminated"
                else:
                    marker = ""
                print(f"    {j:3d}: {name}{marker}")
            break
        else:
            # The offset loop finished without finding any executor.
            print(f"\n  {label}: (no executor found)")
    print()


# --- Benchmark ---
print("-" * 60)
print("Benchmark (min of 3 runs)")
print("-" * 60)

# N is the target number of loop iterations per measurement; each timed
# call runs one workload for INNER iterations.
N = 2_000_000
INNER = 1000

# (label, workload) pairs; labels are padded so the output columns align.
benchmarks = [
    ("Point(x, y)           (__init__) ", bench_init),
    ("p.distance_sq()       (method)   ", bench_method_chain),
    ("p.translate().dist()  (chain)    ", bench_translate_chain),
    ("v.scale().add().dot() (vector)   ", bench_vector_ops),
    ("list.append(i)        (descr)    ", bench_list_append),
    ("s.startswith()        (str meth) ", bench_str_method),
]

# Number of timed calls per run. The iterations actually executed are
# iters * INNER, which equals N only when INNER divides N, so the
# per-iteration time below is normalised by the real count.
iters = N // INNER
total_iterations = iters * INNER

for label, fn in benchmarks:
    # Three independent runs; the minimum is reported to reduce noise.
    times = timeit.repeat(lambda: fn(INNER), repeat=3, number=iters)
    t = min(times)
    print(f"  {label}: {t / total_iterations * 1e9:.1f} ns/iter")

print()

eendebakpt and others added 2 commits April 8, 2026 18:24
…nt type guards

- _CHECK_AND_ALLOCATE_OBJECT: resolve __init__ from type's _spec_cache
  so the optimizer can follow into __init__ bodies
- _GUARD_TYPE_VERSION_LOCKED: add optimizer handler to track type version
  and NOP redundant guards on the same object
- Add test_guard_type_version_locked_removed

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Copy link
Copy Markdown
Member

@Fidget-Spinner Fidget-Spinner left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LGTM, just two comments on wording.

Comment on lines +1568 to +1569
enabling the optimizer to trace into the init frame and eliminate
redundant function version and arg count checks.
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This has nothing to do with tracing into the init frame. We already do that. It's more about propagating information through the frame.

@Fidget-Spinner Fidget-Spinner changed the title gh-148276: Optimize object creation and method calls in the JIT by resolving __init__ at trace compile time gh-148276: Optimize object creation and method calls in the JIT by resolving __init__ at trace optimization time Apr 9, 2026
op(_GUARD_TYPE_VERSION_LOCKED, (type_version/2, owner -- owner)) {
assert(type_version);
if (sym_matches_type_version(owner, type_version)) {
ADD_OP(_NOP, 0, 0);
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should not be removing this as we are moving towards FT compatibility. This uop unlocks objects on FT as well, so we need to keep it around as it's side effecting.

Instead, you should break out the _GUARD_TYPE_VERSION_LOCKED into _GUARD_TYPE_VERSION + UNLOCK. See for example the _LOCK_OBJECT op.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I do not fully understand. The unlock only happens when the type version doesn't match. If that cannot happen, there is no need to keep the unlock part, or is there?

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm renaming the opcode references from _GUARD_TYPE_VERSION_LOCKED to _GUARD_TYPE_VERSION but I'm not sure how to add the unlock part. I would like some help with that part.

eendebakpt and others added 2 commits April 10, 2026 23:11
Co-authored-by: Ken Jin <kenjin4096@gmail.com>
Copy link
Copy Markdown
Contributor

@MazinSharaf MazinSharaf left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just changing the opcode references from _GUARD_TYPE_VERSION_LOCKED to _GUARD_TYPE_VERSION. This follows the review comment on optimizer_bytecodes.c (specifically R140). However, I have not fully addressed it, and I would like some help with the rest of the changes, namely adding the unlock part.

@@ -134,6 +134,21 @@ dummy_func(void) {
assert(!PyJitRef_IsUnique(value));
}

op(_GUARD_TYPE_VERSION_LOCKED, (type_version/2, owner -- owner)) {
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
op(_GUARD_TYPE_VERSION_LOCKED, (type_version/2, owner -- owner)) {
op(_GUARD_TYPE_VERSION, (type_version/2, owner -- owner)) {

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is in reference to the comment on R140

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's not the full solution, but it's going towards that direction.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Still need to add the unlock/deopt stuff, but I'm not quite sure how to go about it; I am still learning about the code here. It would be nice if someone could help me with that. Thanks!

@@ -2721,6 +2721,11 @@ dummy_func(
}

op(_GUARD_TYPE_VERSION_LOCKED, (type_version/2, owner -- owner)) {
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
op(_GUARD_TYPE_VERSION_LOCKED, (type_version/2, owner -- owner)) {
op(_GUARD_TYPE_VERSION, (type_version/2, owner -- owner)) {

Refer to R137's comments and R140's comments on optimizer_bytecodes.c

@@ -2721,6 +2721,11 @@ dummy_func(
}

op(_GUARD_TYPE_VERSION_LOCKED, (type_version/2, owner -- owner)) {
// Guard that type version matches expected value. Object is assumed to be
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
// Guard that type version matches expected value. Object is assumed to be

@@ -2721,6 +2721,11 @@ dummy_func(
}

op(_GUARD_TYPE_VERSION_LOCKED, (type_version/2, owner -- owner)) {
// Guard that type version matches expected value. Object is assumed to be
// locked on entry. If version matches, lock is retained for subsequent
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
// locked on entry. If version matches, lock is retained for subsequent

@@ -2721,6 +2721,11 @@ dummy_func(
}

op(_GUARD_TYPE_VERSION_LOCKED, (type_version/2, owner -- owner)) {
// Guard that type version matches expected value. Object is assumed to be
// locked on entry. If version matches, lock is retained for subsequent
// operations. If mismatch, unlock and exit (deopt). This allows the JIT
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
// operations. If mismatch, unlock and exit (deopt). This allows the JIT

// locked on entry. If version matches, lock is retained for subsequent
// operations. If mismatch, unlock and exit (deopt). This allows the JIT
// optimizer to eliminate this guard entirely if type version is proven,
// in which case the lock is held for the entire trace duration.
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
// in which case the lock is held for the entire trace duration.

Foo.attr = 0
self.assertFalse(ex.is_valid())

def test_guard_type_version_locked_removed(self):
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
def test_guard_type_version_locked_removed(self):
def test_guard_type_version_removed(self):

Please refer to R137's and R140's comments on optimizer_bytecodes.c

op(_GUARD_TYPE_VERSION_LOCKED, (type_version/2, owner -- owner)) {
assert(type_version);
if (sym_matches_type_version(owner, type_version)) {
ADD_OP(_NOP, 0, 0);
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm renaming the opcode references from _GUARD_TYPE_VERSION_LOCKED to _GUARD_TYPE_VERSION but I'm not sure how to add the unlock part. I would like some help with that part.

res, ex = self._run_with_optimizer(thing, TIER2_THRESHOLD)
self.assertIsNotNone(ex)
opnames = list(iter_opnames(ex))
guard_locked_count = opnames.count("_GUARD_TYPE_VERSION_LOCKED")
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
guard_locked_count = opnames.count("_GUARD_TYPE_VERSION_LOCKED")
guard_count = opnames.count("_GUARD_TYPE_VERSION")


def test_guard_type_version_locked_removed(self):
"""
Verify that redundant _GUARD_TYPE_VERSION_LOCKED guards are
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
Verify that redundant _GUARD_TYPE_VERSION_LOCKED guards are
Verify that redundant _GUARD_TYPE_VERSION guards are

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Projects

None yet

Development

Successfully merging this pull request may close these issues.

3 participants